knitr::opts_knit$set(root.dir = "/home/francescoc/Desktop/NodeVerse/data", 
  warning = FALSE, 
  message = FALSE, 
  error = FALSE)

Load Libraries

library(biomaRt)
library(DT)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ✖ dplyr::select() masks biomaRt::select()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(Seurat)
## Loading required package: SeuratObject
## Loading required package: sp
## 
## Attaching package: 'SeuratObject'
## 
## The following object is masked from 'package:DT':
## 
##     JS
## 
## The following objects are masked from 'package:base':
## 
##     intersect, t
## 
## 
## Attaching package: 'Seurat'
## 
## The following object is masked from 'package:DT':
## 
##     JS
library(STRINGdb)

library(NodeVerse)
## Warning: replacing previous import 'igraph::components' by 'Seurat::components'
## when loading 'NodeVerse'

Load Atlas Data

options(timeout = 600)

# PBMC Dataset: Local and systemic responses to SARS-CoV-2 infection in children and adults
seurat_url <- "https://datasets.cellxgene.cziscience.com/89619149-162f-4839-8e97-24735924417c.rds"

seurat_object <- download_Atlas(seurat_url)

seurat_subset <- subset(
  x = seurat_object,
  subset = disease == "normal" & cell_type == "CD4-positive helper T cell" & donor_id %in% c("AN6", "AN9", "NP18")
)
meta_features <- seurat_subset[["RNA"]]@meta.features
stopifnot(all(rownames(meta_features) == rownames(seurat_subset[["RNA"]]@data)))

new_rownames <- meta_features$name
new_rownames <- seurat_subset@assays$RNA@meta.features$name

if (length(new_rownames) != nrow(seurat_subset[["RNA"]]@data)) {
  stop("The length of new_rownames does not match the number of features.")
}

assay <- seurat_subset[["RNA"]] # Access the RNA assay
rownames(assay@counts) <- new_rownames
rownames(assay@data) <- new_rownames
rownames(assay@meta.features) <- new_rownames

seurat_subset[["RNA"]] <- assay

Filtering step

seurat_subset <- PercentageFeatureSet(seurat_subset, "^MT-", col.name = "percent_mito")
# Ribosomal
seurat_subset <- PercentageFeatureSet(seurat_subset, "^RP[SL]", col.name = "percent_ribo")

feats <- c("nFeature_RNA", "nCount_RNA", "percent_mito", "percent_ribo")
VlnPlot(seurat_subset, group.by = "donor_id", split.by = "donor_id", features = feats, pt.size = 0.1, ncol = 3)
## The default behaviour of split.by has changed.
## Separate violin plots are now plotted side-by-side.
## To restore the old behaviour of a single split violin,
## set split.plot = TRUE.
##       
## This message will be shown once per session.

FeatureScatter(seurat_subset, "nCount_RNA", "nFeature_RNA", group.by = "donor_id", pt.size = .5)

selected_c <- WhichCells(seurat_subset, expression = nFeature_RNA > 200)
selected_f <- rownames(seurat_subset)[Matrix::rowSums(seurat_subset) > 3]

seurat_subset_filt <- subset(seurat_subset, features = selected_f, cells = selected_c)
dim(seurat_subset_filt)
## [1] 12342  1431
table(seurat_subset_filt$donor_id)
## 
##  AN1  AN2  AN3  AN5  AN6  AN7  AN9 AN11 AN12 AN13 AN14  AP1  AP2  AP3  AP4  AP5 
##    0    0    0    0  495    0  435    0    0    0    0    0    0    0    0    0 
##  AP6  AP7  AP8  AP9 AP10 AP11 AP12 AP13 NP13 NP15 NP16 NP17 NP18 NP19 NP20 NP21 
##    0    0    0    0    0    0    0    0    0    0    0    0  501    0    0    0 
## NP22 NP23 NP24 NP26 NP27 NP28 NP30 NP31 NP32 NP35 NP36 NP37 NP38 NP39 NP41 NP44 
##    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0 
##  PC2  PC5  PC6  PC9 PC10 PC11 PC17 PC18 PC19 PC21 PC24 PC26 PC27 PC29  PP1  PP2 
##    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0 
##  PP3  PP4  PP5  PP6  PP8  PP9 PP11 PP13 PP14 PP15 PP16 
##    0    0    0    0    0    0    0    0    0    0    0
C <- seurat_subset_filt@assays$RNA@counts
C@x <- C@x / rep.int(colSums(C), diff(C@p)) * 100
most_expressed <- order(Matrix::rowSums(C), decreasing = T)[20:1]
boxplot(as.matrix(t(C[most_expressed, ])),
    cex = 0.1, las = 1, xlab = "Percent counts per cell",
    col = (scales::hue_pal())(20)[20:1], horizontal = TRUE
)

selected_mito <- WhichCells(seurat_subset_filt, expression = percent_mito < 20)
selected_ribo <- WhichCells(seurat_subset_filt, expression = percent_ribo > 5)

# and subset the object to only keep those cells
seurat_subset_filt <- subset(seurat_subset_filt, cells = selected_mito)
seurat_subset_filt <- subset(seurat_subset_filt, cells = selected_ribo)
dim(seurat_subset_filt)
## [1] 12342  1431
feats <- c("nFeature_RNA", "nCount_RNA", "percent_mito", "percent_ribo")
VlnPlot(seurat_subset_filt, group.by = "donor_id", features = feats, pt.size = 0.1, ncol = 3) + NoLegend()

# Filter MALAT1
seurat_subset_filt <- seurat_subset_filt[!grepl("MALAT1", rownames(seurat_subset_filt)), ]

# Filter Mitocondrial
seurat_subset_filt <- seurat_subset_filt[!grepl("^MT-", rownames(seurat_subset_filt)), ]

# Filter Ribossomal gene (optional if that is a problem on your data)
seurat_subset_filt <- seurat_subset_filt[ ! grepl("^RP[SL]", rownames(seurat_subset_filt)), ]

dim(seurat_subset_filt)
## [1] 12233  1431
C <- seurat_subset_filt@assays$RNA@counts
C@x <- C@x / rep.int(colSums(C), diff(C@p)) * 100
most_expressed <- order(Matrix::rowSums(C), decreasing = T)[20:1]
boxplot(as.matrix(t(C[most_expressed, ])),
    cex = 0.1, las = 1, xlab = "Percent counts per cell",
    col = (scales::hue_pal())(20)[20:1], horizontal = TRUE
)

Adjacency matrix

pathgenes <- pathg(seurat_object = seurat_subset_filt, cell_type = "CD4-positive helper T cell", top_n = 1200)
## Top 1200 genes selected based on mean expression.
options(timeout = 2000)
result <- stringdb_adjacency(
  genes          = pathgenes,
  species        = 9606,
  required_score = 900,
  keep_all_genes = F
)
## Initializing STRINGdb...
## Mapping genes to STRING IDs...
## Warning:  we couldn't map to STRING 1% of your identifiers
## Mapped 1185 genes to STRING IDs.
## Retrieving **physical** interactions from STRING API...
## Found 1419 STRING physical interactions.
## Adjacency matrices constructed successfully.
wadjm <- result$weighted
adjm <- result$binary

common_names <- intersect(rownames(adjm), colnames(adjm))
adjm <- adjm[common_names, common_names, drop = FALSE]

sorted_names <- sort(rownames(adjm))
adjm <- adjm[sorted_names, sorted_names]

print(dim(wadjm))
## [1] 554 554
print(dim(adjm))
## [1] 554 554
write.table(adjm, "./../analysis/adjm_p677.txt", sep = "\t", quote = FALSE, col.names = TRUE, row.names = TRUE)
write.table(wadjm, "./../analysis/wadjm_p677.txt", sep = "\t", quote = FALSE, col.names = TRUE, row.names = TRUE)

wadjm %>%
    datatable(extensions = 'Buttons',
            options = list(
              dom = 'Bfrtip',
              buttons = c('csv', 'excel'),
              scrollX = TRUE,
              pageLength = 10), 
            caption = "BioGRID Adjacency Top1000 genes")
## Warning in instance$preRenderHook(instance): It seems your data is too big for
## client-side DataTables. You may consider server-side processing:
## https://rstudio.github.io/DT/server.html
adjm %>%
    datatable(extensions = 'Buttons',
            options = list(
              dom = 'Bfrtip',
              buttons = c('csv', 'excel'),
              scrollX = TRUE,
              pageLength = 10), 
            caption = "BioGRID Adjacency Top1000 genes")
## Warning in instance$preRenderHook(instance): It seems your data is too big for
## client-side DataTables. You may consider server-side processing:
## https://rstudio.github.io/DT/server.html

Split data for donor_id

genes_to_keep <- intersect(pathgenes, rownames(seurat_subset_filt))
seurat_subset_filt <- subset(seurat_subset_filt, features = genes_to_keep)

saveRDS(seurat_subset_filt, "PBMC.top1000.RDS")

Create Ground Truth Network

gtruth <- igraph::graph_from_adjacency_matrix(adjm, mode = "undirected", diag = FALSE)

num_nodes <- igraph::vcount(gtruth)
num_edges <- igraph::ecount(gtruth)

set.seed(1234)

p1 <- ggraph::ggraph(gtruth, layout = "fr") + 
  ggraph::geom_edge_link(color = "gray", width = 0.5) +  # Set edge color and width
  ggraph::geom_node_point(color = "steelblue", size = 0.7) +  # Set node color and size
  labs(title = paste("Ground Truth\nNodes:", igraph::vcount(gtruth), "Edges:", igraph::ecount(gtruth))) +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5, size = 14, face = "bold")
  )

p1

ggsave("./../analysis/gtruth_top1000_s300.png", p1, width = 8, height = 6, dpi = 300, bg = "white")

Simulate n500 p677 matrices

ncell <- 500
nodes <- nrow(adjm)

set.seed(1130)
mu_values <- c(3, 6, 9)
theta_values <- c(1, 0.7, 0.5)

count_matrices <- lapply(1:3, function(i) {
  set.seed(1130 + i)
  mu_i <- mu_values[i]
  theta_i <- theta_values[i]
  
  count_matrix_i <- learn2count::simdata(n = ncell, p = nodes, B = adjm, family = "ZINB", 
                            mu = mu_i, mu_noise = 1, theta = theta_i, pi = 0.2)
  
  count_matrix_df <- as.data.frame(count_matrix_i)
  colnames(count_matrix_df) <- colnames(adjm)
  rownames(count_matrix_df) <- paste("cell", 1:nrow(count_matrix_df), sep = "")
  
  return(count_matrix_df)
})

saveRDS(count_matrices, "./../analysis/sim_n500p677.RDS")

count_matrices[[1]] %>%
    datatable(extensions = 'Buttons',
            options = list(
              dom = 'Bfrtip',
              buttons = c('csv', 'excel'),
              scrollX = TRUE,
              pageLength = 10), 
            caption = "Simulated Matrix")
## Warning in instance$preRenderHook(instance): It seems your data is too big for
## client-side DataTables. You may consider server-side processing:
## https://rstudio.github.io/DT/server.html
library(distributions3)
## 
## Attaching package: 'distributions3'
## The following object is masked from 'package:stats':
## 
##     Gamma
## The following object is masked from 'package:grDevices':
## 
##     pdf
#' Generate multiple zero-inflated negative binomial count matrices with cell-specific depth
#'
#' Simulates zero-inflated negative binomial (ZINB) data using an adjacency matrix,
#' generating `kmat` different matrices with cell-specific sequencing depth.
#'
#' @param n Integer. Number of samples (cells).
#' @param p Integer. Number of variables (genes).
#' @param B Matrix. A symmetric adjacency matrix (binary: 0/1), with row and column names as gene names.
#' @param mu_range List of numeric vectors. Each element is a numeric vector of length 2 specifying `mu` range per matrix.
#' @param mu_noise Numeric vector of length `kmat`. Mean of the noise component for each matrix.
#' @param theta Numeric vector of length `kmat`. Dispersion parameter of the negative binomial distribution for each matrix.
#' @param pi Numeric vector of length `kmat`. Probability of excess zeros (0 < pi < 1) for each matrix.
#' @param kmat Integer. Number of matrices to generate.
#' @param depth_range Numeric vector of length 2 or NA. If provided, defines the range of total counts per cell (e.g., `c(500, 5000)`). Default: `NA` (no depth scaling).
#' @param seed Integer (optional). Random seed for reproducibility.
#' 
#' @return A list of `kmat` numeric matrices, each of dimension `n x p`, where row names correspond to gene names from `B`.
#' @importFrom distributions3 rzinbinom
#' @export
zinb_simdata <- function(n, p, B, mu_range, mu_noise, theta, pi, kmat = 1, depth_range = NA, seed = NULL) {
  
  if (!is.null(seed)) set.seed(seed)
  
  stopifnot(is.numeric(n), n > 0, floor(n) == n)
  stopifnot(is.numeric(p), p > 0, floor(p) == p)
  stopifnot(is.matrix(B), nrow(B) == ncol(B))
  stopifnot(all(B %in% c(0, 1)))
  stopifnot(is.numeric(kmat), kmat > 0, floor(kmat) == kmat)
  stopifnot(length(mu_range) == kmat, all(sapply(mu_range, function(x) length(x) == 2 && all(x > 0))))
  stopifnot(length(mu_noise) == kmat, all(mu_noise >= 0))
  stopifnot(length(theta) == kmat, all(theta > 0))
  stopifnot(length(pi) == kmat, all(pi > 0 & pi < 1))
  
  # Validate depth_range only if it's not NA
  if (!is.na(depth_range[1])) {
    stopifnot(is.numeric(depth_range), length(depth_range) == 2, all(depth_range > 0), depth_range[1] < depth_range[2])
  }

  gene_names <- rownames(B)
  cellID <- paste0("cell_", seq_len(n))
  
  B <- ifelse(B > 0, 1, 0)
  
  edges <- which(B == 1, arr.ind = TRUE)
  edges <- edges[edges[, 1] < edges[, 2], ]  # Remove duplicate edges
  
  A <- diag(1, nrow = p, ncol = p)
  for (i in seq_len(nrow(edges))) {
    tmp <- rep(0, p)
    tmp[edges[i, ]] <- 1
    A <- cbind(A, tmp)
  }
  
  B[edges] <- sample(seq(min(unlist(mu_range)), max(unlist(mu_range))), length(edges[, 1]), replace = TRUE)
  B <- (B | t(B)) * 1  # Ensure symmetry while keeping it binary
  
  matrices <- vector("list", kmat)
  
  for (k in seq_len(kmat)) {
    mu <- runif(p, mu_range[[k]][1], mu_range[[k]][2])  # More efficient sampling
    
    sigma <- B  
    nonzero_sigma <- sigma[lower.tri(sigma) & sigma != 0]
    Y_mu <- c(mu, nonzero_sigma)  
    
    Y <- matrix(rzinbinom(length(Y_mu) * n, mu = rep(Y_mu, each = n), theta = theta[k], pi = pi[k]), 
                nrow = length(Y_mu), ncol = n)
    X <- A %*% Y
    
    noise_matrix <- matrix(rzinbinom(n * p, mu = mu_noise[k], theta = 1, pi = pi[k]), nrow = p, ncol = n)
    X <- X + noise_matrix
    
    X <- t(X)
    if (!is.null(gene_names)) colnames(X) <- gene_names
    rownames(X) <- cellID
    
    # Apply sequencing depth scaling if depth_range is provided
    if (!is.na(depth_range[1])) {
      cell_depths <- runif(n, min = depth_range[1], max = depth_range[2])
      
      row_sums <- rowSums(X)
      row_sums[row_sums == 0] <- 1  # Prevent division by zero
      
      X <- sweep(X, 1, row_sums, FUN = "/")
      X[is.na(X)] <- 0  # Handle zero rows
      
      X <- sweep(X, 1, cell_depths, FUN = "*")
      X <- round(X)
    }
    
    matrices[[k]] <- X
  }
  
  return(matrices)
}
simat <- zinb_simdata(
  n = ncell, 
  p = nodes, 
  B = adjm, 
  mu_range = list(c(1, 4), c(1, 7), c(1, 10)),
  mu_noise = c(1, 3, 5),  
  theta = c(1, 0.7, 0.5),  
  pi = c(0.2, 0.2, 0.2),  
  kmat = 3,
  depth_range = c(3000, 5000)
)

saveRDS(simat, "./../analysis/sim_n500p677.RDS")

simat[[1]] %>%
    datatable(extensions = 'Buttons',
            options = list(
              dom = 'Bfrtip',
              buttons = c('csv', 'excel'),
              scrollX = TRUE,
              pageLength = 10), 
            caption = "Simulated Matrix 1")
## Warning in instance$preRenderHook(instance): It seems your data is too big for
## client-side DataTables. You may consider server-side processing:
## https://rstudio.github.io/DT/server.html